##Packages
#install.packages('tidyverse')
#install.packages('dplyr')
#install.packages('car')
#install.packages("DescTools")
#install.packages("arules")
#install.packages("Rtools")
#install.packages("stats")
#install.packages("forecast")
#install.packages("TSstudio")
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(DescTools)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:DescTools':
##
## Recode
## The following object is masked from 'package:arules':
##
## recode
library(stats)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:arules':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:DescTools':
##
## BoxCox
library(tseries)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
library(TSstudio)
cancer_incidence<- read.csv(file="C:/Users/olivi/OneDrive/Documents/DataSets/13100109-eng/13100109.csv", sep = ",")
#first remove columns from cancer incidence frame
##############################################################
cancer_table<- subset(cancer_incidence, select = -c(DGUID,UOM_ID,UOM,SCALAR_FACTOR,SCALAR_ID, VECTOR, COORDINATE, STATUS, SYMBOL, TERMINATED, DECIMALS))
#now remove all rows that contain the characteristic value that we don't want to measure
cancer_cl<-cancer_table[-grep("Statistically different from", cancer_table$Characteristics),]
cancer_cl<- cancer_cl[grep("age-standardized", cancer_cl$Characteristics),]
cancer_cl<-cancer_cl[-grep("95%", cancer_cl$Characteristics),]
#cancer_cl<-cancer_cl[grep("Both", cancer_cl$Sex),]
#cancer_cl<-cancer_cl[grep("All invasive", cancer_cl$Selected.sites.of.cancer..ICD.O.3.),]
#try spread
#remove duplicated elements so spread works
cancer_w<-cancer_cl[!duplicated(cancer_cl),]
cancer_w4<-spread(cancer_w, Characteristics, VALUE)
cancer_w4<- cancer_w4[-grep("Canada", cancer_w4$GEO),]
cancer_w4<- cancer_w4[-grep("Peer", cancer_w4$GEO),]
n_distinct(cancer_cl$GEO)
## [1] 154
##154
summary(cancer_w4)
## ï..REF_DATE GEO Sex
## Length:19162 Length:19162 Length:19162
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Selected.sites.of.cancer..ICD.O.3.
## Length:19162
## Class :character
## Mode :character
##
##
##
##
## Cancer incidence (age-standardized rate per 100,000 population)
## Min. : 0.0
## 1st Qu.: 68.6
## Median : 97.3
## Mean : 211.2
## 3rd Qu.: 439.5
## Max. :1295.7
## NA's :1049
str(cancer_w4)
## 'data.frame': 19162 obs. of 5 variables:
## $ ï..REF_DATE : chr "2001/2003" "2001/2003" "2001/2003" "2001/2003" ...
## $ GEO : chr "Alberta" "Alberta" "Alberta" "Alberta" ...
## $ Sex : chr "Both sexes" "Both sexes" "Both sexes" "Females" ...
## $ Selected.sites.of.cancer..ICD.O.3. : chr "All invasive primary cancer sites (including in situ bladder)" "Bronchus and lung cancer [C34.0-C34.9]" "Colon, rectum and rectosigmoid junction cancer [C18.0-C18.9, C19.9, C20.9, C26.0]" "All invasive primary cancer sites (including in situ bladder)" ...
## $ Cancer incidence (age-standardized rate per 100,000 population): num 543.8 68.9 63.2 467.9 57.7 ...
hist(cancer_w4$`Cancer incidence (age-standardized rate per 100,000 population)`)
cancer_2<-cancer_table[-grep("Statistically different from", cancer_table$Characteristics),]
cancer_2<-cancer_2[-grep("95%", cancer_2$Characteristics),]
cancer_2<-cancer_2[grep("age-standardized", cancer_2$Characteristics),]
cancer_2<-cancer_2[-grep("Canada", cancer_2$GEO),]
cancer_2<-cancer_2[-grep("Peer", cancer_2$GEO),]
cancer_2<- cancer_2[!duplicated(cancer_2),]
cancer_2.1<- spread(cancer_2, Characteristics, VALUE)
cancer_2.2<- cancer_2.1[,1:5]
cancer_2.2<- spread(cancer_2.2, Selected.sites.of.cancer..ICD.O.3., 'Cancer incidence (age-standardized rate per 100,000 population)')
cancer_2.2<-as.data.frame(unclass(cancer_2.2), stringsAsFactors = TRUE)
#levels(cancer_2.2$GEO)
hist(cancer_2.2[,4], breaks = 50, main = "All cancer sites")
hist(cancer_2.2[,5], breaks = 50, main = "Bronchus and lung cancer")
hist(cancer_2.2[,6], breaks = 50, main = "Colon cancer")
hist(cancer_2.2[,7], breaks = 50, main = "Female breast cancer")
hist(cancer_2.2[,8], breaks = 25, main = "Prostate cancer")
cancer_2.3<- cancer_2.2[,1:3]
cancer_2.3["Quantile_All"]<- CutQ(cancer_2.2[,4], breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_bronc"]<- CutQ(cancer_2.2[,5], breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_colon"]<- CutQ(cancer_2.2[,6], breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_breast"]<- CutQ(cancer_2.2[,7],breaks = 6, na.rm=FALSE)
cancer_2.3["Quantile_prost"]<- CutQ(cancer_2.2[,8], breaks = 6, na.rm =FALSE)
##not sure if apriori would work with this data, haven't figured it out yet
#when I had results none were pertaining to geography
geo<- list(cancer_2.3$GEO)
#assco_rules<- apriori(cancer_2.3, parameter = list(supp = 0.01, conf = 0.05), appearance = list(lhs=geo))
#summary(assco_rules)
#inspect(assco_rules)
#######SPLIT NEW TABLE INTO GEOGRAPHIC REGIONS
##with quantiles
cancer_ON.3<- cancer_2.3[grep("Ontario", cancer_2.3$GEO),]
cancer_QC.3<- cancer_2.3[grep("Quebec", cancer_2.3$GEO),]
cancer_BC.3<- cancer_2.3[grep("British Columbia", cancer_2.3$GEO),]
cancer_MB.3<- cancer_2.3[grep("Manitoba", cancer_2.3$GEO),]
cancer_SK.3<- cancer_2.3[grep("Saskatchewan", cancer_2.3$GEO),]
cancer_AB.3<- cancer_2.3[grep("Alberta", cancer_2.3$GEO),]
cancer_midwest.3<- rbind(cancer_MB.3,cancer_AB.3,cancer_SK.3)
cancer_NL.3<- cancer_2.3[grep("Newfoundland", cancer_2.3$GEO),]
cancer_NB.3<- cancer_2.3[grep("New Brunswick", cancer_2.3$GEO),]
cancer_NS.3<- cancer_2.3[grep("Nova Scotia", cancer_2.3$GEO),]
cancer_PEI.3<- cancer_2.3[grep("Prince Edward Island",cancer_2.3$GEO),]
cancer_maritimes.3<- rbind(cancer_NS.3,cancer_NB.3,cancer_NL.3,cancer_PEI.3)
cancer_YT.3<- cancer_2.3[grep("Yukon", cancer_2.3$GEO),]
cancer_NWT.3<-cancer_2.3[grep("Northwest Territories", cancer_2.3$GEO),]
cancer_NVT.3<-cancer_2.3[grep("Nunavut", cancer_2.3$GEO),]
cancer_territories.3<- rbind(cancer_NWT.3,cancer_YT.3,cancer_NVT.3)
##continuous
cancer_ON.2<- cancer_2.2[grep("Ontario", cancer_2.2$GEO),]
cancer_QC.2<- cancer_2.2[grep("Quebec", cancer_2.2$GEO),]
cancer_BC.2<- cancer_2.2[grep("British Columbia", cancer_2.2$GEO),]
cancer_MB.2<- cancer_2.2[grep("Manitoba", cancer_2.2$GEO),]
cancer_SK.2<- cancer_2.2[grep("Saskatchewan", cancer_2.2$GEO),]
cancer_AB.2<- cancer_2.2[grep("Alberta", cancer_2.2$GEO),]
cancer_midwest.2<- rbind(cancer_MB.2,cancer_AB.2,cancer_SK.2)
cancer_NL.2<- cancer_2.2[grep("Newfoundland", cancer_2.2$GEO),]
cancer_NB.2<- cancer_2.2[grep("New Brunswick", cancer_2.2$GEO),]
cancer_NS.2<- cancer_2.2[grep("Nova Scotia", cancer_2.2$GEO),]
cancer_PEI.2<- cancer_2.2[grep("Prince Edward Island",cancer_2.2$GEO),]
cancer_maritimes.2<- rbind(cancer_NS.2,cancer_NB.2,cancer_NL.2,cancer_PEI.2)
cancer_YT.2<- cancer_2.2[grep("Yukon", cancer_2.2$GEO),]
cancer_NWT.2<-cancer_2.2[grep("Northwest Territories", cancer_2.2$GEO),]
cancer_NVT.2<-cancer_2.2[grep("Nunavut", cancer_2.2$GEO),]
cancer_territories.2<- rbind(cancer_NWT.2,cancer_YT.2,cancer_NVT.2)
##try time series analysis
###Just a peek
ts_ON.2<-ts(data = cancer_ON.2, frequency = n_distinct(cancer_ON.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_ON.2)
ts_QC.2<-ts(data = cancer_QC.2, frequency = n_distinct(cancer_QC.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_QC.2)
ts_BC.2<-ts(data = cancer_BC.2, frequency = n_distinct(cancer_BC.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_BC.2)
ts_midwest.2<-ts(data = cancer_midwest.2, frequency = n_distinct(cancer_midwest.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_midwest.2)
ts_maritimes.2<-ts(data = cancer_maritimes.2,frequency = n_distinct(cancer_maritimes.2$GEO) , start = 2001, end = 2015)
ts.plot(ts_maritimes.2)
ts_territories.2<-ts(data = cancer_territories.2, frequency = n_distinct(cancer_territories.2$GEO) ,start = 2001, end = 2015)
ts.plot(ts_territories.2)
class(ts_BC.2)
## [1] "mts" "ts" "matrix"
####SO the frequency used is the number of distinct values in the GEO feature.
####This is because this is the "interval" that repeats over time.
##"Argument frequency indicates the sampling frequency of the time series, with the default value 1 indicating one sample in each unit time interval."
plot(decompose(ts_ON.2[,4]))
plot(decompose(ts_ON.2[,5]))
plot(decompose(ts_ON.2[,6]))
#plot(decompose(ts_ON.2[,7], na.action(na.pass)))
#plot(decompose(ts_ON.2[,8],na.action(na.pass)))
plot(decompose(ts_QC.2[,4]))
plot(decompose(ts_QC.2[,5]))
plot(decompose(ts_QC.2[,6]))
plot(decompose(ts_BC.2[,4]))
plot(decompose(ts_BC.2[,5]))
plot(decompose(ts_BC.2[,6]))
plot(decompose(ts_midwest.2[,4]))##residuals are not white noise
plot(decompose(ts_midwest.2[,5]))##residuals are not white noise
plot(decompose(ts_midwest.2[,6]))##residuals are not white noise
plot(decompose(ts_maritimes.2[,4]))
plot(decompose(ts_maritimes.2[,5]))##residuals are not white noise
plot(decompose(ts_maritimes.2[,6]))
plot(decompose(ts_territories.2[,4]))
plot(decompose(ts_territories.2[,5]))
plot(decompose(ts_territories.2[,6]))
##All cancer
ts.plot(ts_ON.2[,4], main = "All sites of cancer")
ts.plot(ts_QC.2[,4], main = "All sites of cancer")
ts.plot(ts_BC.2[,4], main = "All sites of cancer")
ts.plot(ts_midwest.2[,4], main = "All sites of cancer")
ts.plot(ts_maritimes.2[,4], main = "All sites of cancer")
ts.plot(ts_territories.2[,4], main = "All sites of cancer")
#Lung
ts.plot(ts_ON.2[,5], main="Bronchus and lung cancer")
ts.plot(ts_QC.2[,5], main="Bronchus and lung cancer")
ts.plot(ts_BC.2[,5], main="Bronchus and lung cancer")
ts.plot(ts_midwest.2[,5], main="Bronchus and lung cancer")
ts.plot(ts_maritimes.2[,5], main="Bronchus and lung cancer")
ts.plot(ts_territories.2[,5], main="Bronchus and lung cancer")
#Colon
ts.plot(ts_ON.2[,6], main="Colon cancer")
ts.plot(ts_QC.2[,6], main="Colon cancer")
ts.plot(ts_BC.2[,6], main="Colon cancer")
ts.plot(ts_midwest.2[,6], main="Colon cancer")
ts.plot(ts_maritimes.2[,6], main="Colon cancer")
ts.plot(ts_territories.2[,6], main="Colon cancer")
###check ACF to determine what predictive model to use
#Ontario
acf(ts_ON.2[,4], type = "correlation")
acf(ts_ON.2[,4], type= "partial")
acf(ts_ON.2[,5], type = "correlation")
acf(ts_ON.2[,5], type= "partial")
acf(ts_ON.2[,6], type = "correlation")
acf(ts_ON.2[,6], type= "partial")
#Quebec
acf(ts_QC.2[,4], type = "correlation")
acf(ts_QC.2[,4], type= "partial")
acf(ts_QC.2[,5], type = "correlation")
acf(ts_QC.2[,5], type= "partial")
acf(ts_QC.2[,6], type = "correlation")
acf(ts_QC.2[,6], type= "partial")
#BRITISH COLUMBIA
acf(ts_BC.2[,4], type = "correlation")
acf(ts_BC.2[,4], type= "partial")
acf(ts_BC.2[,5], type = "correlation")
acf(ts_BC.2[,5], type= "partial")
acf(ts_BC.2[,6], type = "correlation")
acf(ts_BC.2[,6], type= "partial")
##MIDWEST
acf(ts_midwest.2[,4], type = "correlation")
acf(ts_midwest.2[,4], type= "partial")
acf(ts_midwest.2[,5], type = "correlation")
acf(ts_midwest.2[,5], type= "partial")
acf(ts_midwest.2[,6], type = "correlation")
acf(ts_midwest.2[,6], type= "partial")
#Maritimes
acf(ts_maritimes.2[,4], type = "correlation")
acf(ts_maritimes.2[,4], type= "partial")
acf(ts_maritimes.2[,5], type = "correlation")
acf(ts_maritimes.2[,5], type= "partial")
acf(ts_maritimes.2[,6], type = "correlation")
acf(ts_maritimes.2[,6], type= "partial")
#territories
acf(ts_territories.2[,4], type = "correlation")
acf(ts_territories.2[,4], type= "partial")
acf(ts_territories.2[,5], type = "correlation")
acf(ts_territories.2[,5], type= "partial")
acf(ts_territories.2[,6], type = "correlation")
acf(ts_territories.2[,6], type= "partial")
#####
###doesn't look like any of them are sufficiently stationary
##makes me want to use ARIMA
##time series regression
###see decomposition
##tests for stationarity
#ljung-box - non-stationary will have low p-value
##other tests indicated a lag of 6 so we will see how this changes the ljung-box test
Box.test(ts_ON.2[,4], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_ON.2[, 4]
## X-squared = 41.804, df = 1, p-value = 1.009e-10
Box.test(ts_ON.2[,5], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_ON.2[, 5]
## X-squared = 20.283, df = 1, p-value = 6.678e-06
Box.test(ts_ON.2[,6], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_ON.2[, 6]
## X-squared = 11.866, df = 1, p-value = 0.0005718
Box.test(ts_QC.2[,4], type="Ljung-Box")##fail to reject null of stationarity
##
## Box-Ljung test
##
## data: ts_QC.2[, 4]
## X-squared = 3.5323, df = 1, p-value = 0.06018
Box.test(ts_QC.2[,5], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_QC.2[, 5]
## X-squared = 31.127, df = 1, p-value = 2.417e-08
Box.test(ts_QC.2[,6], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_QC.2[, 6]
## X-squared = 31.829, df = 1, p-value = 1.684e-08
Box.test(ts_BC.2[,4], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_BC.2[, 4]
## X-squared = 14.29, df = 1, p-value = 0.0001567
Box.test(ts_BC.2[,5], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_BC.2[, 5]
## X-squared = 6.0415, df = 1, p-value = 0.01397
Box.test(ts_BC.2[,6], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_BC.2[, 6]
## X-squared = 7.7952, df = 1, p-value = 0.005239
Box.test(ts_midwest.2[,4], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_midwest.2[, 4]
## X-squared = 32.01, df = 1, p-value = 1.534e-08
Box.test(ts_midwest.2[,5], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_midwest.2[, 5]
## X-squared = 13.422, df = 1, p-value = 0.0002486
Box.test(ts_midwest.2[,6], type="Ljung-Box")##fail to reject null of stationarity
##
## Box-Ljung test
##
## data: ts_midwest.2[, 6]
## X-squared = 2.085, df = 1, p-value = 0.1487
Box.test(ts_maritimes.2[,4], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_maritimes.2[, 4]
## X-squared = 34.233, df = 1, p-value = 4.889e-09
Box.test(ts_maritimes.2[,5], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_maritimes.2[, 5]
## X-squared = 41.948, df = 1, p-value = 9.374e-11
Box.test(ts_maritimes.2[,6], type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts_maritimes.2[, 6]
## X-squared = 8.2406, df = 1, p-value = 0.004096
Box.test(ts_territories.2[,4], type="Ljung-Box")##fail to reject null of stationarity
##
## Box-Ljung test
##
## data: ts_territories.2[, 4]
## X-squared = 0.19385, df = 1, p-value = 0.6597
Box.test(ts_territories.2[,5], type="Ljung-Box")##fail to reject null of stationarity
##
## Box-Ljung test
##
## data: ts_territories.2[, 5]
## X-squared = 1.9755, df = 1, p-value = 0.1599
Box.test(ts_territories.2[,6], type="Ljung-Box")##fail to reject null of stationarity
##
## Box-Ljung test
##
## data: ts_territories.2[, 6]
## X-squared = 8.0948e-06, df = 1, p-value = 0.9977
#generally reject the null that these are stationary time series
#stationary might indicate that there is an equally likely
#chance of cancer diagnosis regardless of location
#Augmented dickey-fuller t-stat test for unit root
options(warn = -1)
#non-stationary has a large p-value
adf.test(ts_ON.2[,4])
##
## Augmented Dickey-Fuller Test
##
## data: ts_ON.2[, 4]
## Dickey-Fuller = -8.561, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_ON.2[,5])
##
## Augmented Dickey-Fuller Test
##
## data: ts_ON.2[, 5]
## Dickey-Fuller = -7.9376, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_ON.2[,6])
##
## Augmented Dickey-Fuller Test
##
## data: ts_ON.2[, 6]
## Dickey-Fuller = -7.7782, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_QC.2[,4])
##
## Augmented Dickey-Fuller Test
##
## data: ts_QC.2[, 4]
## Dickey-Fuller = -6.9374, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_QC.2[,5])
##
## Augmented Dickey-Fuller Test
##
## data: ts_QC.2[, 5]
## Dickey-Fuller = -7.5649, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_QC.2[,6])
##
## Augmented Dickey-Fuller Test
##
## data: ts_QC.2[, 6]
## Dickey-Fuller = -4.9935, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_BC.2[,4])
##
## Augmented Dickey-Fuller Test
##
## data: ts_BC.2[, 4]
## Dickey-Fuller = -5.5942, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_BC.2[,5])
##
## Augmented Dickey-Fuller Test
##
## data: ts_BC.2[, 5]
## Dickey-Fuller = -5.1055, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_BC.2[,6])
##
## Augmented Dickey-Fuller Test
##
## data: ts_BC.2[, 6]
## Dickey-Fuller = -5.318, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_midwest.2[,4])
##
## Augmented Dickey-Fuller Test
##
## data: ts_midwest.2[, 4]
## Dickey-Fuller = -7.0176, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_midwest.2[,5])
##
## Augmented Dickey-Fuller Test
##
## data: ts_midwest.2[, 5]
## Dickey-Fuller = -7.0317, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_midwest.2[,6])
##
## Augmented Dickey-Fuller Test
##
## data: ts_midwest.2[, 6]
## Dickey-Fuller = -6.2022, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_maritimes.2[,4])
##
## Augmented Dickey-Fuller Test
##
## data: ts_maritimes.2[, 4]
## Dickey-Fuller = -5.9679, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_maritimes.2[,5])
##
## Augmented Dickey-Fuller Test
##
## data: ts_maritimes.2[, 5]
## Dickey-Fuller = -6.2437, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_maritimes.2[,6])
##
## Augmented Dickey-Fuller Test
##
## data: ts_maritimes.2[, 6]
## Dickey-Fuller = -6.7881, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_territories.2[,4])##fail to reject null of non-stationarity
##
## Augmented Dickey-Fuller Test
##
## data: ts_territories.2[, 4]
## Dickey-Fuller = -2.4322, Lag order = 3, p-value = 0.403
## alternative hypothesis: stationary
adf.test(ts_territories.2[,5])##fail to reject null of non-stationarity
##
## Augmented Dickey-Fuller Test
##
## data: ts_territories.2[, 5]
## Dickey-Fuller = -2.6982, Lag order = 3, p-value = 0.2979
## alternative hypothesis: stationary
adf.test(ts_territories.2[,6])##fail to reject null of non-stationarity
##
## Augmented Dickey-Fuller Test
##
## data: ts_territories.2[, 6]
## Dickey-Fuller = -2.1182, Lag order = 3, p-value = 0.527
## alternative hypothesis: stationary
##reject the null of non-stationarity for all except ^^
##KPSS for level or trend stationarity
options(warn = -1)
## lower p-value indicates non stationarity
kpss.test(ts_ON.2[,4], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_ON.2[, 4]
## KPSS Trend = 0.03125, Truncation lag parameter = 6, p-value = 0.1
kpss.test(ts_ON.2[,5], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_ON.2[, 5]
## KPSS Trend = 0.02115, Truncation lag parameter = 6, p-value = 0.1
kpss.test(ts_ON.2[,6], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_ON.2[, 6]
## KPSS Trend = 0.045779, Truncation lag parameter = 6, p-value = 0.1
kpss.test(ts_QC.2[,4], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_QC.2[, 4]
## KPSS Trend = 0.027699, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_QC.2[,5], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_QC.2[, 5]
## KPSS Trend = 0.030488, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_QC.2[,6], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_QC.2[, 6]
## KPSS Trend = 0.043577, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_BC.2[,4], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_BC.2[, 4]
## KPSS Trend = 0.094938, Truncation lag parameter = 4, p-value = 0.1
kpss.test(ts_BC.2[,5], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_BC.2[, 5]
## KPSS Trend = 0.033942, Truncation lag parameter = 4, p-value = 0.1
kpss.test(ts_BC.2[,6], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_BC.2[, 6]
## KPSS Trend = 0.04278, Truncation lag parameter = 4, p-value = 0.1
kpss.test(ts_midwest.2[,4], null = "Trend")##reject the null that this is level stationary
##
## KPSS Test for Trend Stationarity
##
## data: ts_midwest.2[, 4]
## KPSS Trend = 0.096673, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_midwest.2[,5], null = "Trend")##reject the null that this is level stationary
##
## KPSS Test for Trend Stationarity
##
## data: ts_midwest.2[, 5]
## KPSS Trend = 0.048228, Truncation lag parameter = 5, p-value = 0.1
kpss.test(ts_midwest.2[,6], null = "Trend")##reject the null that this is level/trend stationary
##
## KPSS Test for Trend Stationarity
##
## data: ts_midwest.2[, 6]
## KPSS Trend = 0.3926, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_maritimes.2[,4], null = "Trend")##reject the null that this is level/trend stationary
##
## KPSS Test for Trend Stationarity
##
## data: ts_maritimes.2[, 4]
## KPSS Trend = 0.16538, Truncation lag parameter = 5, p-value = 0.03385
kpss.test(ts_maritimes.2[,5], null = "Trend")##reject the null that this is trend stationary
##
## KPSS Test for Trend Stationarity
##
## data: ts_maritimes.2[, 5]
## KPSS Trend = 0.3217, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_maritimes.2[,6], null = "Trend")##reject the null that this is level/trend stationary
##
## KPSS Test for Trend Stationarity
##
## data: ts_maritimes.2[, 6]
## KPSS Trend = 0.1833, Truncation lag parameter = 5, p-value = 0.02226
kpss.test(ts_territories.2[,4], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_territories.2[, 4]
## KPSS Trend = 0.097352, Truncation lag parameter = 3, p-value = 0.1
kpss.test(ts_territories.2[,5], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_territories.2[, 5]
## KPSS Trend = 0.075875, Truncation lag parameter = 3, p-value = 0.1
kpss.test(ts_territories.2[,6], null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_territories.2[, 6]
## KPSS Trend = 0.065017, Truncation lag parameter = 3, p-value = 0.1
##cannot reject the null of stationarity, p-value indicates they could be stationary
split.ON<-ts_split(ts.obj=ts_ON.2, sample.out =104)
train.ON<- split.ON$train
test.ON<- split.ON$test
split.QC<- ts_split(ts_QC.2, sample.out = 38)
train.QC<- split.QC$train
test.QC<- split.QC$test
split.BC<- ts_split(ts_BC.2, sample.out = 34)
train.BC<- split.BC$train
test.BC<- split.BC$test
split.mid<- ts_split(ts_midwest.2, sample.out =48)
train.mid<- split.mid$train
test.mid<- split.mid$test
split.mar<- ts_split(ts_maritimes.2, sample.out = 38)
train.mar<- split.mar$train
test.mar<- split.mar$test
split.terr<- ts_split(ts_territories.2, sample.out = 6)
train.terr<-split.terr$train
test.terr<-split.terr$test
#TSLM
#need to train models
##time series regression [tsr]-> tslm
tsr_ON<- tslm(formula= train.ON[,4]~GEO+Sex,data=train.ON)
summary(tsr_ON)
##
## Call:
## tslm(formula = train.ON[, 4] ~ GEO + Sex, data = train.ON)
##
## Residuals:
## Min 1Q Median 3Q Max
## -177.08 -56.27 14.43 48.32 181.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 431.36473 7.38936 58.377 <2e-16 ***
## GEO 0.13625 0.06517 2.091 0.037 *
## Sex 42.49623 2.98760 14.224 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61.01 on 622 degrees of freedom
## Multiple R-squared: 0.2496, Adjusted R-squared: 0.2472
## F-statistic: 103.4 on 2 and 622 DF, p-value: < 2.2e-16
tsr_ON.5<- tslm(formula= train.ON[,5]~GEO+Sex,data=train.ON)
summary(tsr_ON.5)
##
## Call:
## tslm(formula = train.ON[, 5] ~ GEO + Sex, data = train.ON)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.956 -10.918 -0.237 11.334 109.730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.58994 2.06089 26.489 <2e-16 ***
## GEO 0.03397 0.01818 1.869 0.0621 .
## Sex 7.98038 0.83324 9.578 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.02 on 622 degrees of freedom
## Multiple R-squared: 0.1329, Adjusted R-squared: 0.1301
## F-statistic: 47.66 on 2 and 622 DF, p-value: < 2.2e-16
tsr_ON.6<- tslm(formula= train.ON[,6]~GEO+Sex,data=train.ON)
summary(tsr_ON.6)
##
## Call:
## tslm(formula = train.ON[, 6] ~ GEO + Sex, data = train.ON)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.811 -10.100 1.616 7.415 51.972
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.45088 1.47359 35.594 < 2e-16 ***
## GEO 0.03957 0.01300 3.045 0.00243 **
## Sex 6.90200 0.59579 11.585 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.17 on 622 degrees of freedom
## Multiple R-squared: 0.1876, Adjusted R-squared: 0.185
## F-statistic: 71.84 on 2 and 622 DF, p-value: < 2.2e-16
##tsr_ON.7 <- tslm(formula= train.ON[,7]~GEO, data=train.ON)
##summary(tsr_ON.7)
##tsr_ON.8<- tslm(formula= train.ON[,8]~GEO, data=train.ON)
##summary(tsr_ON.8)
tsr_QC<- tslm(formula = train.QC[,4]~GEO+Sex, data = train.QC)
summary(tsr_QC)
##
## Call:
## tslm(formula = train.QC[, 4] ~ GEO + Sex, data = train.QC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -244.95 -85.53 12.03 50.62 678.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.039 111.360 1.051 0.294382
## GEO 4.239 1.265 3.352 0.000941 ***
## Sex 48.832 8.506 5.741 3.02e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105.2 on 226 degrees of freedom
## Multiple R-squared: 0.1646, Adjusted R-squared: 0.1572
## F-statistic: 22.26 on 2 and 226 DF, p-value: 1.494e-09
tsr_QC.5<- tslm(formula= train.QC[,5]~GEO+Sex, data=train.QC)
summary(tsr_QC.5)
##
## Call:
## tslm(formula = train.QC[, 5] ~ GEO + Sex, data = train.QC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.763 -30.540 -2.385 19.302 311.667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.5370 55.1246 -1.915 0.05682 .
## GEO 2.1234 0.6261 3.392 0.00082 ***
## Sex 17.9715 4.2108 4.268 2.9e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.08 on 226 degrees of freedom
## Multiple R-squared: 0.1171, Adjusted R-squared: 0.1093
## F-statistic: 14.99 on 2 and 226 DF, p-value: 7.733e-07
tsr_QC.6<- tslm(formula= train.QC[,6]~GEO+Sex, data=train.QC)
summary(tsr_QC.6)
##
## Call:
## tslm(formula = train.QC[, 6] ~ GEO + Sex, data = train.QC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.410 -17.142 -5.573 7.205 299.661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -154.4394 39.7898 -3.881 0.000136 ***
## GEO 2.4964 0.4519 5.524 9.08e-08 ***
## Sex 10.7403 3.0394 3.534 0.000497 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.59 on 226 degrees of freedom
## Multiple R-squared: 0.1609, Adjusted R-squared: 0.1535
## F-statistic: 21.67 on 2 and 226 DF, p-value: 2.452e-09
##tsr_QC.7<- tslm(formula= train.QC[,7]~GEO, data=train.QC)
##summary(tsr_QC.7)
#tsr_QC.8<- tslm(formula= train.QC[,8]~GEO, data=train.QC)
#summary(tsr_QC.8)
tsr_BC<- tslm(formula = train.BC[,4]~GEO+Sex, data = train.BC)
summary(tsr_BC)
##
## Call:
## tslm(formula = train.BC[, 4] ~ GEO + Sex, data = train.BC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.711 -47.297 9.945 45.519 150.076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 427.4896 12.3623 34.580 < 2e-16 ***
## GEO -0.1688 0.1171 -1.442 0.151
## Sex 37.6058 4.9332 7.623 9.47e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.74 on 202 degrees of freedom
## Multiple R-squared: 0.229, Adjusted R-squared: 0.2214
## F-statistic: 30 on 2 and 202 DF, p-value: 3.912e-12
tsr_BC.5<- tslm(formula = train.BC[,5]~GEO+Sex, data = train.BC)
summary(tsr_BC.5)
##
## Call:
## tslm(formula = train.BC[, 5] ~ GEO + Sex, data = train.BC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.630 -8.844 -0.679 8.441 31.503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.48881 2.71480 22.649 < 2e-16 ***
## GEO -0.05301 0.02571 -2.062 0.0405 *
## Sex 5.79429 1.08334 5.349 2.39e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.68 on 202 degrees of freedom
## Multiple R-squared: 0.1392, Adjusted R-squared: 0.1307
## F-statistic: 16.33 on 2 and 202 DF, p-value: 2.664e-07
tsr_BC.6<-tslm(formula = train.BC[,6]~GEO+Sex, data=train.BC)
summary(tsr_BC.6)
##
## Call:
## tslm(formula = train.BC[, 6] ~ GEO + Sex, data = train.BC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.845 -7.052 1.992 6.410 26.953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.10754 2.18667 23.830 < 2e-16 ***
## GEO -0.01514 0.02071 -0.731 0.466
## Sex 5.76268 0.87259 6.604 3.46e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.21 on 202 degrees of freedom
## Multiple R-squared: 0.1791, Adjusted R-squared: 0.171
## F-statistic: 22.03 on 2 and 202 DF, p-value: 2.209e-09
tsr_mid<-tslm(formula = train.mid[,4]~GEO+Sex, data=train.mid)
summary(tsr_mid)
##
## Call:
## tslm(formula = train.mid[, 4] ~ GEO + Sex, data = train.mid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -112.80 -39.73 13.94 38.60 101.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 484.30711 9.40910 51.472 < 2e-16 ***
## GEO -0.30077 0.08149 -3.691 0.000268 ***
## Sex 36.03721 3.59248 10.031 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.91 on 286 degrees of freedom
## Multiple R-squared: 0.2844, Adjusted R-squared: 0.2794
## F-statistic: 56.84 on 2 and 286 DF, p-value: < 2.2e-16
tsr_mid.5<-tslm(formula = train.mid[,5]~GEO+Sex, data=train.mid)
summary(tsr_mid.5)
##
## Call:
## tslm(formula = train.mid[, 5] ~ GEO + Sex, data = train.mid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.634 -8.517 -2.382 6.140 75.202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.83188 2.97618 22.792 < 2e-16 ***
## GEO -0.03844 0.02578 -1.491 0.137
## Sex 6.15756 1.13633 5.419 1.28e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.79 on 286 degrees of freedom
## Multiple R-squared: 0.09911, Adjusted R-squared: 0.09281
## F-statistic: 15.73 on 2 and 286 DF, p-value: 3.295e-07
tsr_mid.6<- tslm(formula = train.mid[,6]~GEO+Sex, data=train.mid)
summary(tsr_mid.6)
##
## Call:
## tslm(formula = train.mid[, 6] ~ GEO + Sex, data = train.mid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.983 -11.507 1.895 8.593 35.078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.93722 2.44946 24.061 < 2e-16 ***
## GEO -0.02100 0.02122 -0.990 0.323
## Sex 7.11474 0.93523 7.608 4.06e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.99 on 286 degrees of freedom
## Multiple R-squared: 0.1704, Adjusted R-squared: 0.1646
## F-statistic: 29.37 on 2 and 286 DF, p-value: 2.504e-12
tsr_mar<-tslm(formula = train.mar[,4]~GEO+Sex, data= train.mar)
summary(tsr_mar)
##
## Call:
## tslm(formula = train.mar[, 4] ~ GEO + Sex, data = train.mar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -184.48 -64.61 16.89 53.30 112.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 473.150844 22.772138 20.778 <2e-16 ***
## GEO -0.001946 0.170043 -0.011 0.991
## Sex 50.093376 5.455674 9.182 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.48 on 226 degrees of freedom
## Multiple R-squared: 0.2717, Adjusted R-squared: 0.2652
## F-statistic: 42.15 on 2 and 226 DF, p-value: 2.764e-16
tsr_mar.5<- tslm(formula = train.mar[,5]~GEO+Sex, data = train.mar)
summary(tsr_mar.5)
##
## Call:
## tslm(formula = train.mar[, 5] ~ GEO + Sex, data = train.mar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.55 -12.96 2.84 11.12 61.33
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.28167 5.51549 12.017 < 2e-16 ***
## GEO 0.01807 0.04119 0.439 0.661
## Sex 11.10074 1.32138 8.401 4.94e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.34 on 226 degrees of freedom
## Multiple R-squared: 0.2384, Adjusted R-squared: 0.2317
## F-statistic: 35.37 on 2 and 226 DF, p-value: 4.314e-14
tsr_mar.6<- tslm(formula = train.mar[,6]~GEO+Sex, data = train.mar)
summary(tsr_mar.6)
##
## Call:
## tslm(formula = train.mar[, 6] ~ GEO + Sex, data = train.mar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.351 -10.640 2.387 8.646 28.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.71140 4.22475 14.370 < 2e-16 ***
## GEO 0.01491 0.03155 0.473 0.637
## Sex 8.28765 1.01215 8.188 1.95e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.52 on 226 degrees of freedom
## Multiple R-squared: 0.2293, Adjusted R-squared: 0.2225
## F-statistic: 33.62 on 2 and 226 DF, p-value: 1.648e-13
tsr_terr<- tslm(formula = train.terr[,4]~GEO+Sex, data = train.terr)
summary(tsr_terr)
##
## Call:
## tslm(formula = train.terr[, 4] ~ GEO + Sex, data = train.terr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -116.667 -27.504 -0.104 31.033 103.371
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 488.04 21.45 22.755 < 2e-16 ***
## GEO NA NA NA NA
## Sex 27.36 10.04 2.727 0.00993 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.15 on 35 degrees of freedom
## Multiple R-squared: 0.1752, Adjusted R-squared: 0.1516
## F-statistic: 7.434 on 1 and 35 DF, p-value: 0.009932
tsr_terr.5<- tslm(formula = train.terr[,5]~GEO+Sex, data = train.terr)
summary(tsr_terr.5)
##
## Call:
## tslm(formula = train.terr[, 5] ~ GEO + Sex, data = train.terr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.54 -10.51 -2.44 12.83 31.19
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.301 6.455 11.820 8.98e-14 ***
## GEO NA NA NA NA
## Sex 5.070 3.021 1.678 0.102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.09 on 35 degrees of freedom
## Multiple R-squared: 0.07449, Adjusted R-squared: 0.04805
## F-statistic: 2.817 on 1 and 35 DF, p-value: 0.1022
tsr_terr.6<-tslm(formula = train.terr[,6]~GEO+Sex, data = train.terr)
##test predictions
newdata.1<- data.frame(
ï..REF_DATE = test.ON[,1],
GEO = test.ON[,2],
Sex = test.ON[,3]
)
fcast.ON.4<-forecast(tsr_ON, newdata = newdata.1,h=2)
autoplot(fcast.ON.4)+
autolayer(test.ON[,4])
newdata.2<- data.frame(
GEO = test.ON[,2],
Sex = test.ON[,3]
)
fcast.ON.5<-forecast(tsr_ON.5, newdata = newdata.2,h=2)
autoplot(fcast.ON.5)+
autolayer(test.ON[,5])
newdata.3<- data.frame(
GEO = test.ON[,2],
Sex = test.ON[,3]
)
fcast.ON.6<-forecast(tsr_ON.6, newdata = newdata.3,h=2)
autoplot(fcast.ON.6, h=2)+
autolayer(test.ON[,6])
newdata.1.1<- data.frame(
GEO = test.QC[,2],
Sex = test.QC[,3]
)
fcast.QC.4<-forecast(tsr_QC, newdata = newdata.1.1,h=2)
autoplot(fcast.QC.4)+
autolayer(test.QC[,4])
newdata.2.1<- data.frame(
GEO = test.QC[,2],
Sex = test.QC[,3]
)
fcast.QC.5<-forecast(tsr_QC.5, newdata = newdata.2.1,h=2)
autoplot(fcast.QC.5)+
autolayer(test.QC[,5])
newdata.3.1<- data.frame(
GEO = test.QC[,2],
Sex = test.QC[,3]
)
fcast.QC.6<-forecast(tsr_QC.6, newdata = newdata.3.1,h=2)
autoplot(fcast.QC.6)+
autolayer(test.QC[,6])
newdata.1.2<- data.frame(
GEO = test.BC[,2],
Sex = test.BC[,3]
)
fcast.BC.4<-forecast(tsr_BC, newdata = newdata.1.2,h=2)
autoplot(fcast.BC.4)+
autolayer(test.BC[,4])
newdata.2.2<- data.frame(
GEO = test.BC[,2],
Sex = test.BC[,3]
)
fcast.BC.5<-forecast(tsr_BC.5, newdata = newdata.2.2,h=2)
autoplot(fcast.BC.5)+
autolayer(test.BC[,5])
newdata.3.2<- data.frame(
GEO = test.BC[,2],
Sex = test.BC[,3]
)
fcast.BC.6<-forecast(tsr_BC.6, newdata = newdata.3.2,h=2)
autoplot(fcast.BC.6)+
autolayer(test.BC[,6])
newdata.1.3<- data.frame(
GEO = test.mid[,2],
Sex = test.mid[,3]
)
fcast.mid.4<-forecast(tsr_mid, newdata = newdata.1.3,h=2)
autoplot(fcast.mid.4)+
autolayer(test.mid[,4])
newdata.2.3<- data.frame(
GEO = test.mid[,2],
Sex = test.mid[,3]
)
fcast.mid.5<-forecast(tsr_mid.5, newdata = newdata.2.3,h=2)
autoplot(fcast.mid.5)+
autolayer(test.mid[,5])
newdata.3.3<- data.frame(
GEO = test.mid[,2],
Sex = test.mid[,3]
)
fcast.mid.6<-forecast(tsr_mid.6, newdata = newdata.3.3,h=2)
autoplot(fcast.mid.6)+
autolayer(test.mid[,6])
newdata.1.4<- data.frame(
GEO = test.mar[,2],
Sex = test.mar[,3]
)
fcast.mar.4<-forecast(tsr_mar, newdata = newdata.1.4,h=2)
autoplot(fcast.mar.4)+
autolayer(test.mar[,4])
newdata.2.4<- data.frame(
GEO = test.mar[,2],
Sex = test.mar[,3]
)
fcast.mar.5<-forecast(tsr_mar.5, newdata = newdata.2.4,h=2)
autoplot(fcast.mar.5)+
autolayer(test.mar[,5])
newdata.3.4<- data.frame(
GEO = test.mar[,2],
Sex = test.mar[,3]
)
fcast.mar.6<-forecast(tsr_mar.6, newdata = newdata.3.4,h=2)
autoplot(fcast.mar.6)+
autolayer(test.mar[,6])
newdata.1.5<- data.frame(
GEO = test.terr[,2],
Sex = test.terr[,3]
)
fcast.terr.4<-forecast(tsr_terr, newdata = newdata.1.5,h=2)
autoplot(fcast.terr.4)+
autolayer(test.terr[,4])
newdata.2.5<- data.frame(
GEO = test.terr[,2],
Sex = test.terr[,3]
)
fcast.terr.5<-forecast(tsr_terr.5, newdata = newdata.2.5,h=2)
autoplot(fcast.terr.5)+
autolayer(test.terr[,5])
newdata.3.5<- data.frame(
GEO = test.terr[,2],
Sex = test.terr[,3]
)
fcast.terr.6<-forecast(tsr_terr.6, newdata = newdata.3.5,h=2)
autoplot(fcast.terr.6)+
autolayer(test.terr[,6])
##ARIMA model or SARIMA
##trying non seasonal ARIMA models
##with the currently non differenced data
##staring with auto.arima
####################All cancer sites
#(auto.arima(train.ON[,4], seasonal = TRUE))
#(Arima(train.ON[,4], order = c(1,0,5), seasonal = c(1,0,1)))#model from auto.arima
#AIC=6659.09 AICc=6659.45 BIC=6703.47
#(Arima(train.ON[,4], order = c(1,1,6), seasonal = c(0,1,1)))
#(arima_ON.m<- Arima(train.ON[,4], order = c(1,1,6), seasonal = c(1,1,1)))
#AIC=6127.88 AICc=6128.27 BIC=6171.37
#(arima_ON.m<- Arima(train.ON[,4], order = c(1,1,8), seasonal = c(0,1,1)))
#AIC=6089.67 AICc=6090.14 BIC=6137.51
arima_ON.m<- Arima(train.ON[,4], order = c(1,1,8), seasonal = c(0,1,2))####BEST AIC
#AIC=5865.88 AICc=5866.44 BIC=5918.07
#################Lungs
#(arima_ON.5<- auto.arima(train.ON[,5], seasonal = TRUE))
#(arima_ON.5<- Arima(train.ON[,5], order = c(1,0,5), seasonal = c(0,0,1)))#model from auto.arima
#AIC=5183.45 AICc=5183.75 BIC=5223.39
#(arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,5), seasonal = c(0,1,1)))
# AIC=4791.34 AICc=4791.6 BIC=4826.15
#(arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,7), seasonal = c(0,1,1)))
#AIC=4742.87 AICc=4743.26 BIC=4786.38
#(arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,8), seasonal = c(0,1,2)))
#AIC=4506.99 AICc=4507.55 BIC=4559.2
arima_ON.5.m<- Arima(train.ON[,5], order = c(1,0,9), seasonal = c(0,1,2))##BEST AIC
#AIC=4497.12 AICc=4497.77 BIC=4553.68
#############################COLON
#(arima_ON.6<- auto.arima(train.ON[,6], seasonal = FALSE)) #False higher AIC than True
#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,5), seasonal = c(0,0,0)))#model from auto.arima
#AIC=4753.36 AICc=4753.59 BIC=4788.86
#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,6), seasonal = c(0,0,0)))
#AIC=4689.1 AICc=4689.39 BIC=4729.04
#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,5), seasonal = c(0,1,1)))
#AIC=4437.09 AICc=4437.35 BIC=4471.9
#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,6), seasonal = c(0,1,1)))
#AIC=4352.96 AICc=4353.28 BIC=4392.12
#(arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,7), seasonal = c(0,1,1)))
#AIC=4335.71 AICc=4336.1 BIC=4379.22
arima_ON.6.m<- Arima(train.ON[,6], order = c(1,0,8), seasonal = c(0,1,2))##BEST AIC
#AIC=4117.67 AICc=4118.23 BIC=4169.88
fcast_ON.4<-forecast(arima_ON.m)
autoplot(fcast_ON.4)+
autolayer(test.ON[,4])
fcast_ON.5<-forecast(arima_ON.5.m)
autoplot(fcast_ON.5)+
autolayer(test.ON[,5])
fcast_ON.6<-forecast(arima_ON.6.m)
autoplot(fcast_ON.6)+
autolayer(test.ON[,6])
#######################All cancer sites
#(arima_QC<- auto.arima(train.QC[,4], seasonal = TRUE))
#(arima_QC<- Arima(train.QC[,4], order = c(1,0,1), seasonal = c(0,0,1)))##auto.arima model
#AIC=2813.12 AICc=2813.38 BIC=2830.28
#(arima_QC.m<- Arima(train.QC[,4], order = c(1,0,3), seasonal = c(0,0,1)))
#AIC=2799.16 AICc=2799.66 BIC=2823.19
#(arima_QC.m<- Arima(train.QC[,4], order = c(1,0,3), seasonal = c(0,1,1)))
#AIC=2570.48 AICc=2570.89 BIC=2590.56
#(arima_QC.m<- Arima(train.QC[,4], order = c(1,1,4), seasonal = c(0,1,1)))
#AIC=2568.64 AICc=2569.2 BIC=2592.04
arima_QC.m<- Arima(train.QC[,4], order = c(1,1,4), seasonal = c(0,1,2))##BEST AIC
#AIC=2526.05 AICc=2526.77 BIC=2552.79
##########################LUNGS
#(arima_QC.5<- auto.arima(train.QC[,5], seasonal = TRUE))
#(arima_QC.5<-Arima(train.QC[,5], order = c(0,0,1), seasonal = c(0,0,2)))##auto.arima model
#AIC=2407.76 AICc=2408.02 BIC=2424.92
#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,0,4), seasonal = c(0,1,2)))
#AIC=2190.36 AICc=2191.08 BIC=2217.14
#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,2)))
#AIC=2188.35 AICc=2189.07 BIC=2215.09
#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,3)))
#AIC=2178.95 AICc=2179.85 BIC=2209.03
#(arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,4)))
#AIC=2125.93 AICc=2127.04 BIC=2159.36
arima_QC.5.m<-Arima(train.QC[,5], order = c(1,1,4), seasonal = c(0,1,5))
#AIC=2104.38 AICc=2105.72 BIC=2141.14
############################# Colon
#(arima_QC.6<- auto.arima(train.QC[,6], seasonal = TRUE))
#(arima_QC.6.m<-Arima(train.QC[,6], order = c(0,0,3), seasonal = c(0,0,1)))##auto.arima model
#AIC=2258.1 AICc=2258.48 BIC=2278.7
#(arima_QC.6.m<-Arima(train.QC[,6], order = c(4,0,3), seasonal = c(0,0,1)))
#AIC=2250.51 AICc=2251.52 BIC=2284.85
#(arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,1,1)))
#AIC=2114.69 AICc=2115.11 BIC=2134.78
#(arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,2,1)))
#AIC=2110.41 AICc=2110.86 BIC=2129.92
#(arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,2,4)))
#AIC=2005.74 AICc=2006.74 BIC=2035.01
arima_QC.6.m<-Arima(train.QC[,6], order = c(1,0,3), seasonal = c(0,2,5))##BEST AIC
#AIC=1980.22 AICc=1981.44 BIC=2012.74
fcast_QC.4<-forecast(arima_QC.m)
autoplot(fcast_QC.4)+
autolayer(test.QC[,4])
fcast_QC.5<-forecast(arima_QC.5.m)
autoplot(fcast_QC.5)+
autolayer(test.QC[,5])
fcast_QC.6<-forecast(arima_QC.6.m)
autoplot(fcast_QC.6)+
autolayer(test.QC[,6])
##############################ALL cancer sites
#(arima_BC<-auto.arima(train.BC[,4], seasonal = TRUE))
#(arima_BC<- Arima(train.BC[,4], order = c(1,0,5), seasonal = c(0,0,1)))#auto.arima model
#AIC=2187.68 AICc=2188.6 BIC=2217.58
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,5), seasonal = c(0,1,1)))
#AIC=2041.8 AICc=2042.6 BIC=2067.69
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,6), seasonal = c(0,1,1)))
#AIC=2020.01 AICc=2021.02 BIC=2049.14
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,2)))
#AIC=1917.45 AICc=1919.54 BIC=1959.52
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,3)))
#AIC=1906.1 AICc=1908.53 BIC=1951.41
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,4)))
#AIC=1847.06 AICc=1849.85 BIC=1895.61
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,5)))
#AIC=1823.34 AICc=1826.52 BIC=1875.12
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,6)))
#AIC=1815.42 AICc=1819.02 BIC=1870.44
#(arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,7)))
#AIC=1784.21 AICc=1788.26 BIC=1842.46
arima_BC.m<- Arima(train.BC[,4], order = c(1,0,9), seasonal = c(0,1,8))
#AIC=1779.17 AICc=1783.69 BIC=1840.66
##takes forever to run
#############################################LUNGS
#(arima_BC.5<-auto.arima(train.BC[,5], seasonal = FALSE))##better than seasonal=TRUE
#AIC=1531.24 AICc=1531.98 BIC=1557.82
#(arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(0,1,1)))
#AIC=1358.72 AICc=1359.53 BIC=1384.62
#(arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(1,1,1)))
#AIC=1334.85 AICc=1335.86 BIC=1363.98
#(arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(1,1,4)))
#AIC=1242.85 AICc=1244.64 BIC=1281.69
arima_BC.5.m<- Arima(train.BC[,5], order = c(2,0,4), seasonal = c(2,1,4))###BEST AIC
#AIC=1148.16 AICc=1150.25 BIC=1190.23
################################# Colon
#(arima_BC.6<-auto.arima(train.BC[,6], seasonal = TRUE))
#(arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,0,2)))#auto.arima model
#AIC=1444.7 AICc=1445.83 BIC=1477.93
#(arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,1,2)))
#AIC=1361.42 AICc=1362.43 BIC=1390.55
#(arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,1,4)))
#AIC=1302.51 AICc=1304.01 BIC=1338.11
arima_BC.6.m<- Arima(train.BC[,6], order = c(1,0,5), seasonal = c(0,1,5))##BEST AIC
#AIC=1295.83 AICc=1297.62 BIC=1334.67
fcast_BC.4<-forecast(arima_BC.m)
autoplot(fcast_BC.4)+
autolayer(test.BC[,4])
fcast_BC.5<-forecast(arima_BC.5.m)
autoplot(fcast_BC.5)+
autolayer(test.BC[,5])
fcast_BC.6<-forecast(arima_BC.6.m)
autoplot(fcast_BC.6)+
autolayer(test.BC[,6])
##################################ALL cancer sites
#(arima_mid<- auto.arima(train.mid[,4], seasonal= TRUE))
#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,2), seasonal = c(1,1,0)))#auto.arima model
#AIC=2581.3 AICc=2581.62 BIC=2602.77
#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,1,0)))
#AIC=2577.45 AICc=2577.88 BIC=2602.51
#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,1,1)))
#AIC=2555.31 AICc=2555.87 BIC=2583.95
#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,1)))
#AIC=2442.21 AICc=2442.83 BIC=2470.09
#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,3)))
#AIC=2419.11 AICc=2420.07 BIC=2453.96
#(arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,5)))
#AIC=2412.21 AICc=2413.58 BIC=2454.02
arima_mid.m<- Arima(train.mid[,4], order = c(2,0,3), seasonal = c(1,2,6))##no noticeable improvement from 6->8
#AIC=2405.48 AICc=2407.08 BIC=2450.78
###########################################LUNGS
#(arima_mid.5<- auto.arima(train.mid[,5], seasonal= TRUE))
#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,2), seasonal = c(2,0,1)))##auto.arima model
#AIC=2239.1 AICc=2240.24 BIC=2283.06
#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,0,1)))
#AIC=2169.99 AICc=2171.12 BIC=2213.94
#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,1,1)))
#AIC=2022.71 AICc=2023.95 BIC=2065.62
#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,1,2)))
#AIC=2020.75 AICc=2022.2 BIC=2067.23
#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,1,3), seasonal = c(2,2,2)))
#AIC=2000.26 AICc=2001.87 BIC=2045.51
#(arima_mid.5.m<- Arima(train.mid[,5], order = c(5,2,3), seasonal = c(2,2,2)))
#AIC=1991.14 AICc=1992.75 BIC=2036.33 ##also solved warning in previous version
arima_mid.5.m<- Arima(train.mid[,5], order = c(5,2,4), seasonal = c(2,2,2))##BEST AIC
#AIC=1971.84 AICc=1973.72 BIC=2020.52
########################################## Colon
#(arima_mid.6<- auto.arima(train.mid[,6], seasonal= TRUE))
#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,0,0), seasonal = c(2,1,1))) ##auto.arima model
#AIC=1904.11 AICc=1904.55 BIC=1929.17
#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,0,2), seasonal = c(2,1,1)))
#AIC=1898.68 AICc=1899.24 BIC=1927.32
#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,1,1)))
#AIC=1891.71 AICc=1892.28 BIC=1920.32
#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,1,2)))
#AIC=1887.73 AICc=1888.44 BIC=1919.92
#(arima_mid.5.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,2,2)))
#AIC=1851.87 AICc=1852.65 BIC=1883.19
arima_mid.6.m<- Arima(train.mid[,6], order = c(2,1,2), seasonal = c(2,3,2)) ##BEST AIC
#AIC=1840.36 AICc=1841.23 BIC=1870.74
fcast_mid.4<-forecast(arima_mid.m)
autoplot(fcast_mid.4)+
autolayer(test.mid[,4])
fcast_mid.5<-forecast(arima_mid.5.m)
autoplot(fcast_mid.5)+
autolayer(test.mid[,5])
fcast_mid.6<-forecast(arima_mid.6.m)
autoplot(fcast_mid.6) +
autolayer(test.mid[,6])
#(arima_mar<- auto.arima(train.mar[,4], seasonal = TRUE))
#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,0) ,seasonal = c(0,0,2))) #auto.arima model
#AIC=2201.95 AICc=2202.61 BIC=2229.38
#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,1) ,seasonal = c(0,0,2)))
#AIC=2198.94 AICc=2199.6 BIC=2226.37
#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,0,2)))
#AIC=2131.73 AICc=2132.74 BIC=2166.02
#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,1,2)))
#AIC=2002.72 AICc=2003.83 BIC=2036.14
#(arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,2,2)))
#AIC=1916.7 AICc=1917.93 BIC=1949.17
arima_mar.m <- Arima(train.mar[,4], order = c(4,1,3) ,seasonal = c(0,2,3))## BEST AIC
#AIC=1915.16 AICc=1916.65 BIC=1950.88
######################################LUNGS
#(arima_mar.5<- auto.arima(train.mar[,5], seasonal = TRUE))
#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,1), seasonal = c(0,0,2)))##auto.arima model
#AIC=1663.09 AICc=1664.1 BIC=1697.42
#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,1), seasonal = c(0,1,2)))
#AIC=1581.21 AICc=1582.11 BIC=1611.33
#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,2), seasonal = c(0,1,2)))
#AIC=1558.89 AICc=1559.99 BIC=1592.36
#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,2), seasonal = c(0,2,2)))
#AIC=1516.86 AICc=1518.08 BIC=1549.38
#(arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,2), seasonal = c(0,3,2)))
#AIC=1474.06 AICc=1475.43 BIC=1505.54
arima_mar.5.m<- Arima(train.mar[,5], order = c(5,0,3), seasonal = c(0,3,2))##BEST AIC
#AIC=1472.04 AICc=1473.69 BIC=1506.66
############################### Colon
#(arima_mar.6<- auto.arima(train.mar[,6], seasonal = TRUE))
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,5), seasonal = c(0,0,1))) #auto.arima model
#AIC=1668.67 AICc=1669.33 BIC=1696.1
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,5), seasonal = c(0,1,1)))
#AIC=1596.2 AICc=1596.75 BIC=1619.59
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,6), seasonal = c(0,1,1)))
#AIC=1566.62 AICc=1567.34 BIC=1593.36
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,1,4)))
#AIC=1497.4 AICc=1499.89 BIC=1547.53
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,1,5)))
#AIC=1491.06 AICc=1493.89 BIC=1544.54
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,2,5)))
#AIC=1471.18 AICc=1474.32 BIC=1523.13
#(arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,3,5)))
#AIC=1469.12 AICc=1472.65 BIC=1519.38
arima_mar.6.m<- Arima(train.mar[,6], order = c(0,1,10), seasonal = c(0,3,6))##BEST AIC
#AIC=1462.55 AICc=1466.55 BIC=1515.95
fcast_mar.4<-forecast(arima_mar.m)
autoplot(fcast_mar.4) +
autolayer(test.mar[,4])
fcast_mar.5<-forecast(arima_mar.5.m)
autoplot(fcast_mar.5) +
autolayer(test.mar[,5])
fcast_mar.6<-forecast(arima_mar.6.m)
autoplot(fcast_mar.6) +
autolayer(test.mar[,6])
#####################ALL cancer sites
#(arima_terr<-auto.arima(train.terr[,4], seasonal = TRUE, stepwise = FALSE, approximation = FALSE))
arima_terr.m<- Arima(train.terr[,4], order = c(0,0,0), seasonal = c(0,1,0))##auto.arima model
#AIC=347.42 AICc=347.55 BIC=348.95
##BEST AIC
############################### LUNGS
#(arima_terr.5<-auto.arima(train.terr[,5], seasonal = TRUE))
arima_terr.5.m<- Arima(train.terr[,5], order= c(0,0,0), seasonal = c(2,0,0))##auto.arima model
#AIC=283.85 AICc=285.1 BIC=290.29
##BEST AIC
################################# COlon
#(arima_terr.6<-auto.arima(train.terr[,6], seasonal = TRUE))
#> ARIMA(2,1,0)
#> AIC=346.54 AICc=347.29 BIC=351.29
#(arima_terr.6.m<- Arima(train.terr[,5], order= c(2,1,1), seasonal = c(0,0,0)))
#AIC=287.9 AICc=289.19 BIC=294.24
#(arima_terr.6.m<- Arima(train.terr[,5], order= c(2,1,1), seasonal = c(0,1,0)))
#AIC=266.43 AICc=267.86 BIC=272.42
arima_terr.6.m<- Arima(train.terr[,5], order= c(2,1,1), seasonal = c(0,2,0))##BEST AIC
#AIC=258.47 AICc=260.07 BIC=264.07
fcast_terr.4<-forecast(arima_terr.m)
autoplot(fcast_terr.4) +
autolayer(test.terr[,4])
fcast_terr.5<-forecast(arima_terr.5.m)
autoplot(fcast_terr.5) +
autolayer(test.terr[,5])
fcast_terr.6<-forecast(arima_terr.6.m)
autoplot(fcast_terr.6) +
autolayer(test.terr[,6])
##forecast function
f.ON<- function(y, h){forecast(arima_ON.m, h=h)}
##explore Exogenous predictor variables with xreg
#compare RMSE
on.1<-tsCV(ts_ON.2[,4], f.ON, h=2)
sqrt(mean(on.1^2, na.rm= TRUE))
## [1] 84.76661
##[1] 84.76661
sqrt(mean(residuals(f.ON(ts_ON.2, h=2))^2, na.rm=TRUE))
## [1] 30.27293
####[1] 30.27293
######lungs forcast
f.ON.5<-function(y, h){forecast(arima_ON.5.m, h=h)}
on.2<-tsCV(ts_ON.2[,5], f.ON.5, h=2)
sqrt(mean(on.2^2, na.rm= TRUE))
## [1] 19.05896
##[1] 19.05896
sqrt(mean(residuals(f.ON.5(ts_ON.2[,5], h=2))^2, na.rm=TRUE))
## [1] 9.144005
#[1] 9.144005
#####colon
f.ON.6<-function(y, h){forecast(arima_ON.6.m, h=h)}
on.3<-tsCV(ts_ON.2[,6], f.ON.6, h=2)
sqrt(mean(on.3^2, na.rm= TRUE))
## [1] 15.02973
##[1] 15.02973
sqrt(mean(residuals(f.ON.6(ts_ON.2[,6], h=2))^2, na.rm=TRUE))
## [1] 6.616241
###[1] 6.616241
#####################ARIMA
accuracy(fcast_ON.4, test.ON[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -0.3357554 30.27293 23.43676 -0.5624577 4.55796 0.2358884
## Test set -1.4377948 53.13471 44.33229 -1.4543186 8.54579 0.4461995
## ACF1 Theil's U
## Training set -0.1072211 NA
## Test set -0.3663429 0.4820504
accuracy(fcast_ON.5, test.ON[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1860213 9.144005 6.693404 -1.711729 9.932586 0.3072607
## Test set -0.7983518 9.947122 7.837666 -3.390988 11.725109 0.3597881
## ACF1 Theil's U
## Training set -0.0137609 NA
## Test set 0.0532865 0.5794236
accuracy(fcast_ON.6, test.ON[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2270698 6.616241 4.918528 -0.9328313 7.405991 0.2734132
## Test set -1.4757116 9.968210 7.966383 -4.5149638 12.613496 0.4428388
## ACF1 Theil's U
## Training set -0.02254191 NA
## Test set -0.36294868 0.4776273
################TSLM
accuracy(fcast.ON.4, test.ON[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -2.732470e-15 60.86150 52.47408 -1.3887475 10.37529 0.5281456
## Test set 7.725614e+00 60.60299 54.77697 0.1155038 10.64216 0.5513240
## ACF1 Theil's U
## Training set -0.1614651 NA
## Test set -0.2009413 0.547298
accuracy(fcast.ON.5, test.ON[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set 8.288981e-16 16.97424 13.27611 -5.976449 20.35705 0.6094400
## Test set 1.941021e-01 12.61792 10.04751 -3.399509 15.62458 0.4612312
## ACF1 Theil's U
## Training set 0.2514777 NA
## Test set 0.3068294 0.8082756
accuracy(fcast.ON.6, test.ON[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set -4.091172e-16 12.13703 9.895152 -3.169308 15.19297 0.5500560
## Test set -4.956624e-01 11.21434 9.422390 -3.817739 15.13001 0.5237759
## ACF1 Theil's U
## Training set -0.05051096 NA
## Test set -0.18129412 0.5417806
NOTE: MASE all are <1
#####################ARIMA
accuracy(fcast_QC.4, test.QC[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -3.402105 81.03498 58.78771 -2.11893804 9.923888 0.4261876
## Test set 5.704239 69.29902 56.19228 -0.05398666 9.875326 0.4073718
## ACF1 Theil's U
## Training set -0.01218758 NA
## Test set -0.16173147 0.5462605
accuracy(fcast_QC.5, test.QC[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set -0.7977578 24.50994 16.87430 -Inf Inf 0.2848762
## Test set -1.0018107 21.84940 16.00479 -3.434945 15.49545 0.2701968
## ACF1 Theil's U
## Training set -0.004871119 NA
## Test set 0.140461623 0.428896
accuracy(fcast_QC.6, test.QC[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set -1.380253 25.09385 13.542098 -4.913560 15.87571 0.3871488
## Test set -2.787314 11.30498 8.680159 -5.462049 12.75794 0.2481531
## ACF1 Theil's U
## Training set -0.002606181 NA
## Test set -0.103577339 0.5255439
################TSLM
accuracy(fcast.QC.4, test.QC[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 7.460068e-16 104.52210 75.16150 -2.629721 12.907732 0.5448911
## Test set -4.602742e+00 65.33167 53.35346 -2.118524 9.709814 0.3867914
## ACF1 Theil's U
## Training set 0.1988024 NA
## Test set -0.1889286 0.5185937
accuracy(fcast.QC.5, test.QC[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set -9.235016e-16 51.74001 33.52550 -Inf Inf 0.5659859
## Test set -5.286283e+00 27.38949 22.31059 -11.57328 24.18798 0.3766530
## ACF1 Theil's U
## Training set 0.4051380 NA
## Test set -0.0446594 0.5712133
accuracy(fcast.QC.6, test.QC[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set 9.636578e-16 37.34677 20.42861 -8.449642 22.82466 0.5840241
## Test set -5.562955e+00 14.44000 11.31650 -9.841291 17.28957 0.3235223
## ACF1 Theil's U
## Training set 0.2982451 NA
## Test set 0.1847868 0.6478085
#####################ARIMA
accuracy(fcast_BC.4, test.BC[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -1.527825 14.98964 11.51634 -0.5274741 2.391969 0.1231133
## Test set 12.934360 30.71271 21.95231 2.0028823 4.079514 0.2346772
## ACF1 Theil's U
## Training set -0.01966609 NA
## Test set -0.21532894 0.3046769
accuracy(fcast_BC.5, test.BC[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2626233 3.532935 2.614178 -0.6773540 3.806916 0.1508118
## Test set 0.5055518 5.768628 3.911727 -0.1145212 5.622419 0.2256673
## ACF1 Theil's U
## Training set -0.03900797 NA
## Test set 0.35746108 0.3048858
accuracy(fcast_BC.6, test.BC[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.4600199 5.503453 4.245021 -0.3776554 6.994721 0.2647857
## Test set 0.1792858 7.680355 6.341853 -1.3306671 10.103317 0.3955768
## ACF1 Theil's U
## Training set 0.02153032 NA
## Test set -0.12049978 0.4498501
################TSLM
accuracy(fcast.BC.4, test.BC[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 5.540284e-16 57.31390 48.83758 -1.386427 10.21451 0.5220895
## Test set 8.112898e+00 59.73265 52.53303 0.249694 10.59902 0.5615950
## ACF1 Theil's U
## Training set -0.1743779 NA
## Test set -0.1910338 0.5457354
accuracy(fcast.BC.5, test.BC[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set 2.010217e-15 12.58629 10.34324 -3.370094 15.50199 0.5967012
## Test set 3.019499e-01 13.83539 10.87916 -3.181984 15.94336 0.6276181
## ACF1 Theil's U
## Training set 0.1904812 NA
## Test set 0.4598499 0.781279
accuracy(fcast.BC.6, test.BC[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set 6.925355e-17 10.137763 8.143271 -2.8533769 13.95716 0.5079413
## Test set 9.665315e-01 9.440846 8.249721 -0.7821131 13.69748 0.5145811
## ACF1 Theil's U
## Training set -0.1029082 NA
## Test set -0.1487001 0.4998617
#####################ARIMA
accuracy(fcast_mid.4, test.mid[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 2.846237 24.62107 17.02520 0.3594439 3.180817 0.5353263
## Test set -13.350272 24.62355 21.39882 -2.4703575 4.015196 0.6728470
## ACF1 Theil's U
## Training set -0.008567216 NA
## Test set 0.261144498 0.2343431
accuracy(fcast_mid.5, test.mid[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.04218503 9.173924 6.403709 -0.5009628 8.460447 0.4780637
## Test set -6.69998552 9.846109 7.976579 -9.9995206 11.864569 0.5954851
## ACF1 Theil's U
## Training set -0.02733384 NA
## Test set 0.19375542 0.7186801
accuracy(fcast_mid.6, test.mid[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2642139 9.413829 6.084825 -0.6310913 8.881973 0.6569747
## Test set 1.8358503 7.256528 5.450974 2.8121503 8.141033 0.5885382
## ACF1 Theil's U
## Training set 0.004247999 NA
## Test set -0.351646059 0.425444
################TSLM
accuracy(fcast.mid.4, test.mid[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -1.190077e-15 49.64674 42.49145 -0.8846553 8.165776 1.336067
## Test set -2.096376e+01 54.83271 40.74082 -5.0178291 8.442828 1.281021
## ACF1 Theil's U
## Training set -0.2685497 NA
## Test set -0.3980486 0.4919871
accuracy(fcast.mid.5, test.mid[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set -7.071129e-16 15.70369 10.94078 -3.523441 14.09613 0.8167749
## Test set -9.716891e+00 12.99978 10.33062 -15.868500 16.65156 0.7712239
## ACF1 Theil's U
## Training set 0.25195557 NA
## Test set 0.01659973 0.9038385
accuracy(fcast.mid.6, test.mid[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set -1.479317e-16 12.92446 10.767460 -3.434188 16.00222 1.162556
## Test set -9.464619e+00 13.12395 9.855274 -17.589116 18.14554 1.064068
## ACF1 Theil's U
## Training set -0.02792106 NA
## Test set -0.32190182 0.6785667
#####################ARIMA
accuracy(fcast_mar.4, test.mar[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.3748883 26.79171 19.22015 -0.03847801 3.467458 0.1669458
## Test set -1.7818321 45.60905 38.05424 -1.11150570 7.069351 0.3305383
## ACF1 Theil's U
## Training set -0.00384057 NA
## Test set -0.06842116 0.2647759
accuracy(fcast_mar.5, test.mar[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.6745681 10.36877 7.255549 0.3295996 8.282918 0.2955188
## Test set -6.4261880 32.74793 26.085511 -16.0184045 32.872583 1.0624638
## ACF1 Theil's U
## Training set 0.006177642 NA
## Test set -0.384584439 0.5029747
accuracy(fcast_mar.6, test.mar[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0843151 8.527493 5.868156 -0.5208992 7.831055 0.2892413
## Test set 4.7871196 23.601140 17.650280 3.1251743 25.629737 0.8699821
## ACF1 Theil's U
## Training set -0.08125448 NA
## Test set -0.34709521 1.035569
################TSLM
accuracy(fcast.mar.4, test.mar[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 1.719709e-15 67.03822 58.60668 -1.436155 10.69647 0.5090563
## Test set -2.785333e+01 84.00296 67.07349 -7.578495 13.58759 0.5825988
## ACF1 Theil's U
## Training set -0.3164878 NA
## Test set -0.3764124 0.499807
accuracy(fcast.mar.5, test.mar[,5])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.000000 16.23688 13.01931 -3.424512 15.59274 0.5302771 -0.3400050
## Test set 2.736364 27.33734 23.42896 -8.155137 29.66163 0.9542623 -0.4326846
## Theil's U
## Training set NA
## Test set 0.4822365
accuracy(fcast.mar.6, test.mar[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set -1.731306e-15 12.43713 10.51453 -2.609385 14.11454 0.5182612
## Test set -9.998602e+00 14.85506 11.52174 -17.859130 19.63211 0.5679064
## ACF1 Theil's U
## Training set -0.1458899 NA
## Test set -0.3400125 0.5747104
#####################ARIMA
accuracy(fcast_terr.4, test.terr[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -6.120165 37.28218 28.90686 -1.3954971 5.494962 0.9204283
## Test set 3.516667 38.77106 35.61667 0.0400974 6.799237 1.1340763
## ACF1 Theil's U
## Training set 0.09744538 NA
## Test set -0.23667890 0.3825451
accuracy(fcast_terr.5, test.terr[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.4571086 9.578816 7.40855 -0.7693159 8.82341 0.7928571
## Test set -5.4459145 17.358887 12.83183 -12.0646412 19.72843 1.3732517
## ACF1 Theil's U
## Training set -0.003140624 NA
## Test set -0.486631480 0.5206454
accuracy(fcast_terr.6, test.terr[,6])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.960658 13.43061 10.29128 2.784172 12.18069 1.101364 -0.04762778
## Test set 3.554966 18.44070 16.41814 4.896150 20.91724 1.757056 -0.58049364
## Theil's U
## Training set NA
## Test set 1.217407
################TSLM
accuracy(fcast.terr.4, test.terr[,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 1.535708e-14 48.77680 37.20890 -0.8171954 6.964245 1.184775
## Test set -3.068323e+01 64.19984 50.57348 -7.2725797 10.430766 1.610319
## ACF1 Theil's U
## Training set 0.05067287 NA
## Test set -0.31542376 0.5129719
accuracy(fcast.terr.5, test.terr[,5])
## ME RMSE MAE MPE MAPE MASE
## Training set 9.603335e-16 14.68081 12.68075 -2.962001 15.24828 1.357083
## Test set -8.073052e+00 17.13564 12.37991 -15.555256 19.88131 1.324888
## ACF1 Theil's U
## Training set -0.2043604 NA
## Test set -0.3772719 0.452088
accuracy(fcast.terr.6, test.terr[,6])
## ME RMSE MAE MPE MAPE MASE
## Training set -1.533519e-15 33.73368 23.19238 -6.106848 19.55314 1.183285
## Test set -3.335152e+01 34.99038 33.35152 -42.337592 42.33759 1.701608
## ACF1 Theil's U
## Training set -0.07129035 NA
## Test set -0.17748632 2.061652